{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "03603465",
   "metadata": {},
   "source": [
    "# Movie Ratings Recommendation using Matrix Factorization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "e433f232",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "import torch.nn as nn\n",
    "from torch.utils.data import Dataset, DataLoader, TensorDataset\n",
    "import torch.nn.functional as F\n",
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a1bed806",
   "metadata": {},
   "source": [
    "### Loading Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "d111f235",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.read_csv(\"./movie_ratings_dataset/ratings.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "f59f5be6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>userId</th>\n",
       "      <th>movieId</th>\n",
       "      <th>rating</th>\n",
       "      <th>timestamp</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>4.0</td>\n",
       "      <td>964982703</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>4.0</td>\n",
       "      <td>964981247</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>6</td>\n",
       "      <td>4.0</td>\n",
       "      <td>964982224</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>47</td>\n",
       "      <td>5.0</td>\n",
       "      <td>964983815</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>50</td>\n",
       "      <td>5.0</td>\n",
       "      <td>964982931</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   userId  movieId  rating  timestamp\n",
       "0       1        1     4.0  964982703\n",
       "1       1        3     4.0  964981247\n",
       "2       1        6     4.0  964982224\n",
       "3       1       47     5.0  964983815\n",
       "4       1       50     5.0  964982931"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "998ed89d",
   "metadata": {},
   "outputs": [],
   "source": [
    "data['userId'] = data['userId'] - 1\n",
    "data['movieId'] = data['movieId'] - 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "7b08f12a",
   "metadata": {},
   "outputs": [],
   "source": [
    "data.drop_duplicates(inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "e246471a",
   "metadata": {},
   "outputs": [],
   "source": [
    "data.dropna(inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "a92d489d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def fillID(df, col):\n",
    "    \"\"\"\n",
    "    Add a dense integer index column for the distinct values of df[col].\n",
    "\n",
    "    Each distinct value receives the next unused integer, starting at 0, in\n",
    "    order of first appearance. The result is stored in df[col + \"Index\"].\n",
    "\n",
    "    Parameters:\n",
    "    df (pandas DataFrame): Frame to modify in place.\n",
    "    col (str): Name of the column whose values are re-indexed.\n",
    "\n",
    "    Returns:\n",
    "    None. The frame is modified in place.\n",
    "    \"\"\"\n",
    "    index_of = {}\n",
    "    # len(index_of) is always the next free id, so no sentinel entry is needed\n",
    "    # (a {-1: -1} sentinel would map a genuine value of -1 to index -1) and no\n",
    "    # max() scan over the mapping is required for every new value.\n",
    "    df[col+\"Index\"] = [index_of.setdefault(value, len(index_of)) for value in df[col].values]\n",
    "    return"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "46aee15a",
   "metadata": {},
   "outputs": [],
   "source": [
    "fillID(data, 'userId')\n",
    "fillID(data, 'movieId')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "c54c8012",
   "metadata": {},
   "outputs": [],
   "source": [
    "def normalize(data):\n",
    "    \"\"\"\n",
    "    Min-max scale the data to the range [0, 1].\n",
    "    \n",
    "    Parameters:\n",
    "    data (numpy array): Input data to be normalized.\n",
    "    \n",
    "    Returns:\n",
    "    normalized_data (numpy array): Normalized data. When every element of\n",
    "    the input is equal, zeros are returned instead of dividing by zero.\n",
    "    \"\"\"\n",
    "    min_val = np.min(data)\n",
    "    value_range = np.max(data) - min_val\n",
    "    if value_range == 0:\n",
    "        # Constant input: (data - min) / 0 would produce NaN, so return\n",
    "        # floating-point zeros of the same shape instead.\n",
    "        return (data - min_val) * 0.0\n",
    "    normalized_data = (data - min_val) / value_range\n",
    "    return normalized_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "67d55ae2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# data['rating'] = normalize(data['rating'].values)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "938aac9b",
   "metadata": {},
   "source": [
    "### Train/Test Splitting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "63fa7031",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Seeded random row mask: True rows go to `train`, the rest to `validation`.\n",
    "np.random.seed(3)\n",
    "msk = np.random.rand(len(data)) < 0.8  # ~80% train, ~20% validation\n",
    "train = data.loc[msk].copy()\n",
    "validation = data.loc[~msk].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "02a9be19",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(100836, 80450, 20386)"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(data), len(train), len(validation)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "0c7c5bed",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['userId', 'movieId', 'rating', 'timestamp', 'userIdIndex',\n",
       "       'movieIdIndex'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "d63f2aff",
   "metadata": {},
   "outputs": [],
   "source": [
    "num_users = data['userIdIndex'].nunique()\n",
    "num_movies = data['movieIdIndex'].nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "c77cc64b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "610 9724\n"
     ]
    }
   ],
   "source": [
    "print(num_users, num_movies)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "bc41b643",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>userId</th>\n",
       "      <th>movieId</th>\n",
       "      <th>rating</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>userIdIndex</th>\n",
       "      <th>movieIdIndex</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>964982703</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>4.0</td>\n",
       "      <td>964981247</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>4.0</td>\n",
       "      <td>964982224</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>46</td>\n",
       "      <td>5.0</td>\n",
       "      <td>964983815</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>0</td>\n",
       "      <td>100</td>\n",
       "      <td>5.0</td>\n",
       "      <td>964980868</td>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   userId  movieId  rating  timestamp  userIdIndex  movieIdIndex\n",
       "0       0        0     4.0  964982703            0             0\n",
       "1       0        2     4.0  964981247            0             1\n",
       "2       0        5     4.0  964982224            0             2\n",
       "3       0       46     5.0  964983815            0             3\n",
       "6       0      100     5.0  964980868            0             6"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1f2e52d1",
   "metadata": {},
   "source": [
    "### Dataset and Dataloader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "46de8f00",
   "metadata": {},
   "outputs": [],
   "source": [
    "batch_size = 5000\n",
    "\n",
    "# Each feature row is (userIdIndex, movieIdIndex); the target keeps the\n",
    "# single 'rating' column as a 2-D array (one column per row).\n",
    "train_features = torch.tensor(train[['userIdIndex', 'movieIdIndex']].values, dtype=torch.long)\n",
    "train_target = torch.tensor(train[['rating']].values, dtype=torch.float32)\n",
    "\n",
    "train_ds = TensorDataset(train_features, train_target)\n",
    "dl_train = DataLoader(train_ds, batch_size, shuffle=True, num_workers=4)\n",
    "\n",
    "val_features = torch.tensor(validation[['userIdIndex', 'movieIdIndex']].values, dtype=torch.long)\n",
    "val_target = torch.tensor(validation[['rating']].values, dtype=torch.float32)\n",
    "\n",
    "val_ds = TensorDataset(val_features, val_target)\n",
    "# The validation loader must wrap val_ds (not train_ds) so that the reported\n",
    "# loss is measured on held-out ratings. Evaluation does not need shuffling.\n",
    "dl_val = DataLoader(val_ds, batch_size, shuffle=False, num_workers=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "f78a2612",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor([[ 605,  217],\n",
      "        [  27, 1140],\n",
      "        [ 605,  142],\n",
      "        ...,\n",
      "        [ 433,  915],\n",
      "        [ 437,  873],\n",
      "        [ 291, 1236]])\n",
      "tensor([[3.5000],\n",
      "        [3.5000],\n",
      "        [2.5000],\n",
      "        ...,\n",
      "        [3.5000],\n",
      "        [4.0000],\n",
      "        [3.5000]])\n"
     ]
    }
   ],
   "source": [
    "xb, yb = next(iter(dl_train))\n",
    "print(xb)\n",
    "print(yb)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0773777c",
   "metadata": {},
   "source": [
    "### Matrix Factorization Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "167b4717",
   "metadata": {},
   "outputs": [],
   "source": [
    "class MatrixFactorization(nn.Module):\n",
    "    \"\"\"Scores a (user, item) pair as the dot product of two learned embedding vectors.\"\"\"\n",
    "\n",
    "    def __init__(self, num_users, num_items, emb_size=100):\n",
    "        super().__init__()\n",
    "        self.user_embedding = nn.Embedding(num_users, emb_size)\n",
    "        self.item_embedding = nn.Embedding(num_items, emb_size)\n",
    "\n",
    "        # Replace the default initialisation with draws from U(0, 0.05),\n",
    "        # user table first, then item table.\n",
    "        for table in (self.user_embedding, self.item_embedding):\n",
    "            table.weight.data.uniform_(0, 0.05)\n",
    "\n",
    "    def forward(self, u, v):\n",
    "        \"\"\"Return one score per position k, computed from user u[k] and item v[k].\"\"\"\n",
    "        user_vecs = self.user_embedding(u)\n",
    "        item_vecs = self.item_embedding(v)\n",
    "        elementwise = user_vecs * item_vecs\n",
    "        return elementwise.sum(1)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6a7fc6f3",
   "metadata": {},
   "source": [
    "### Training Loop"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "id": "a18a034e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def test(model):\n",
    "    \"\"\"\n",
    "    Evaluate the model on the `dl_val` loader and print the mean per-batch MSE.\n",
    "\n",
    "    Parameters:\n",
    "    model (nn.Module): Called as model(users, items) on each batch.\n",
    "\n",
    "    Returns:\n",
    "    None. The loss is printed.\n",
    "    \"\"\"\n",
    "    was_training = model.training\n",
    "    model.eval()\n",
    "    total_loss = []\n",
    "    # Evaluation needs no gradients, so skip building the autograd graph.\n",
    "    with torch.no_grad():\n",
    "        for indices, ratings in dl_val:\n",
    "            ratings = ratings.squeeze(1)\n",
    "            users, items = indices[:,0], indices[:,1]\n",
    "            output = model(users, items)\n",
    "            loss = F.mse_loss(output, ratings)\n",
    "            total_loss.append(loss.item())\n",
    "    # Restore the caller's mode so a mid-training evaluation does not leave\n",
    "    # the model in eval mode for the remaining epochs.\n",
    "    model.train(was_training)\n",
    "    \n",
    "    print(f\"Test Loss: {sum(total_loss) / len(total_loss)}\")\n",
    "    \n",
    "    \n",
    "def trainer(model, num_epochs, lr=0.05, weight_decay=0.01, eval_every=10):\n",
    "    \"\"\"\n",
    "    Train the model on the `dl_train` loader with AdamW and an MSE loss.\n",
    "\n",
    "    Prints the mean per-batch training loss after every epoch and calls\n",
    "    test(model) on every epoch whose 0-based index is a multiple of eval_every.\n",
    "\n",
    "    Parameters:\n",
    "    model (nn.Module): Called as model(users, items) on each batch.\n",
    "    num_epochs (int): Number of passes over `dl_train`.\n",
    "    lr (float): AdamW learning rate.\n",
    "    weight_decay (float): AdamW weight decay.\n",
    "    eval_every (int): Interval, in epochs, between test(model) calls.\n",
    "\n",
    "    Returns:\n",
    "    None.\n",
    "    \"\"\"\n",
    "    # Init optimizer\n",
    "    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)\n",
    "    \n",
    "    # Start training loop\n",
    "    for epoch in range(num_epochs):\n",
    "        # Re-enter training mode at the start of every epoch: the periodic\n",
    "        # test() call below invokes model.eval().\n",
    "        model.train()\n",
    "        total_loss = []\n",
    "        for indices, ratings in dl_train:\n",
    "            ratings = ratings.squeeze(1)\n",
    "            users, items = indices[:,0], indices[:,1]\n",
    "            # Compute model output\n",
    "            output = model(users, items)\n",
    "            # Compute Loss\n",
    "            loss = F.mse_loss(output, ratings)\n",
    "\n",
    "            # Update model weights\n",
    "            optimizer.zero_grad()\n",
    "            loss.backward()\n",
    "            optimizer.step()\n",
    "            total_loss.append(loss.item())\n",
    "        \n",
    "        # Print Error\n",
    "        print(f\"{epoch}/{num_epochs} - Loss: {sum(total_loss)/len(total_loss)}\")\n",
    "        \n",
    "        # Test model at regular intervals\n",
    "        if epoch % eval_every == 0:\n",
    "            test(model)\n",
    "        "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dc57ba94",
   "metadata": {},
   "source": [
    "### Initialize Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "id": "34a87e14",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = MatrixFactorization(num_users, num_movies, emb_size=100)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fc44f012",
   "metadata": {},
   "source": [
    "### Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "id": "37d457e8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0/10 - Loss: 4.208172657910516\n",
      "Test Loss: 1.6839283564511467\n",
      "1/10 - Loss: 1.56242457558127\n",
      "2/10 - Loss: 1.1174142641179703\n",
      "3/10 - Loss: 0.9379285048036015\n",
      "4/10 - Loss: 0.8116016598308787\n",
      "5/10 - Loss: 0.637713611125946\n",
      "6/10 - Loss: 0.5123480242841384\n",
      "7/10 - Loss: 0.4736304546103758\n",
      "8/10 - Loss: 0.431275874376297\n",
      "9/10 - Loss: 0.39761781166581545\n"
     ]
    }
   ],
   "source": [
    "trainer(model, num_epochs=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "id": "cbb1d807",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Test Loss: 0.3274755968767054\n"
     ]
    }
   ],
   "source": [
    "test(model)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e5225d6a",
   "metadata": {},
   "source": [
    "### Sanity Checks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "id": "aaee3f66",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loss:  0.34479594230651855\n"
     ]
    }
   ],
   "source": [
    "# Score one batch from dl_val in eval mode, without building an autograd graph.\n",
    "model.eval()\n",
    "with torch.no_grad():\n",
    "    indices, ratings = next(iter(dl_val))\n",
    "    ratings = ratings.squeeze(1)\n",
    "    users, items = indices[:,0], indices[:,1]\n",
    "    output = model(users, items)\n",
    "    loss = F.mse_loss(output, ratings)\n",
    "print(\"Loss: \",loss.item())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "id": "3d246977",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([2.5000, 2.5000, 3.0000,  ..., 3.0000, 5.0000, 2.0000])"
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ratings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "id": "8c1cb4dc",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([2.7080, 2.4294, 2.4730,  ..., 3.3969, 5.1638, 1.7656],\n",
       "       grad_fn=<SumBackward1>)"
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "19acd2d0",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "torch",
   "language": "python",
   "name": "torch"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
